This notebook is designed to create a model that predicts whether a cell is infected with malaria or not. The dataset used is the Malaria Cell Images Dataset from Kaggle. The dataset contains 27,558 cell images with equal instances of parasitized and uninfected cells. The images are of different sizes.
# Import libraries and set up source directories
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image, ImageOps
from random import randint
from skimage.transform import resize
from skimage import io, img_as_float, color
import seaborn as sns
import concurrent.futures
import gc
# Discover the class sub-directories under the data root; each directory
# name (Parasitized / Uninfected) doubles as a class label.
root = "../data"
labels = [d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))]
datapath = []
for label in labels:
    # Full path of each class directory, in the same order as `labels`.
    datapath.append(os.path.join(root, label))
datapath
['../data/Parasitized', '../data/Uninfected']
We have 2 classes of images: Uninfected and Parasitized. We will use the following encoding for them:
Utility Section
def load_image(path):
    """Load one image file as an RGB array.

    Thin wrapper around ``skimage.io.imread`` so it can be mapped over a
    process pool. The result is a numpy array of shape (M, N, 3), dtype uint8.
    """
    rgb = io.imread(path, pilmode="RGB")
    return rgb
def load_image_bw(path):
    """Load one image file as a grayscale array.

    Thin wrapper around ``skimage.io.imread`` so it can be mapped over a
    process pool. The result is a numpy array of shape (M, N), dtype uint8.
    """
    gray = io.imread(path, pilmode="L")
    return gray
"""Load the images in parallel using concurrent.futures and store them in different lists both in color and in black and white."""
# We use concurrent.futures to speed up the process, since it is a CPU-bound task
uninf_path = [os.path.join(datapath[1], _) for _ in os.listdir(datapath[1])]
para_path = [os.path.join(datapath[0], _) for _ in os.listdir(datapath[0])]
with concurrent.futures.ProcessPoolExecutor() as executor:
uninf_imgs = executor.map(load_image, uninf_path)
para_imgs = executor.map(load_image, para_path)
# convert the map object to a list
uninf_imgs = list(uninf_imgs)
para_imgs = list(para_imgs)
with concurrent.futures.ProcessPoolExecutor() as executor:
uninf_imgs_bw = executor.map(load_image_bw, uninf_path)
para_imgs_bw = executor.map(load_image_bw, para_path)
uninf_imgs_bw = list(uninf_imgs_bw)
para_imgs_bw = list(para_imgs_bw)
# Split each RGB image into its per-channel planes (R, G, B) for later use.
r_u_list, g_u_list, b_u_list = (
    [img[:, :, ch] for img in uninf_imgs] for ch in range(3)
)
r_p_list, g_p_list, b_p_list = (
    [img[:, :, ch] for img in para_imgs] for ch in range(3)
)
Display sample images
We want to see some images to get a feel for the data and to make sure that the data is loaded correctly. We also want to see if there is something detectable by simply looking at the images before we start training a model.
Using images as dataset means that we have one variable per pixel, and each image is a point in a very high-dimensional space. Single variable distributions and pairwise correlations are completely meaningless and unmanageable.
# Select 10 random sample indices per class; they are reused by the next cell
# so the same cells are shown in color and in grayscale.
# BUGFIX: random.randint is inclusive on BOTH ends, so the upper bound must be
# len(...) - 1 — the previous len(...) bound could raise IndexError.
uninf_index = [randint(0, len(uninf_imgs) - 1) for _ in range(10)]
para_index = [randint(0, len(para_imgs) - 1) for _ in range(10)]
# Plot the sampled images, one row per class.
fig, ax = plt.subplots(2, 10, figsize=(30, 5))
fig.suptitle("Uninfected and Parasitized Cells", fontsize=16)
for i in range(10):
    ax[0, i].imshow(uninf_imgs[uninf_index[i]])
    ax[1, i].imshow(para_imgs[para_index[i]])
ax[0, 0].set_ylabel("Uninfected", fontsize=14)
ax[1, 0].set_ylabel("Parasitized", fontsize=14)
plt.show()
# Show the same sampled cells again, this time from the grayscale copies.
fig, ax = plt.subplots(2, 10, figsize=(30, 5))
fig.suptitle("Uninfected and Parasitized Cells (grayscale)", fontsize=16)
for i, (u_idx, p_idx) in enumerate(zip(uninf_index, para_index)):
    ax[0, i].imshow(uninf_imgs_bw[u_idx], cmap="gray")
    ax[1, i].imshow(para_imgs_bw[p_idx], cmap="gray")
ax[0, 0].set_ylabel("Uninfected", fontsize=14)
ax[1, 0].set_ylabel("Parasitized", fontsize=14)
plt.show()
Inspecting the images we can see that:
We will use the grayscale images in the following steps, but we will keep the RGB images for future use.
Create data structures that will hold the images and their labels
We are going to preprocess the images in order to make them more suitable for the model. Using the grayscale images we will:
Note: This process is also useful to reduce computation time when training the model.
"""
Preprocessing functions to transform an image to be used in the model.
We will use other functions later like sklearn scalers.
"""
def remove_background(image, threshold=0.0):
    """Replace background pixels with the image maximum.

    Pixels whose value equals ``threshold`` (the black background of the
    segmented cell images) are set to the brightest value in the image, so
    the background blends with the bright cell interior instead of standing
    out as a dark border.

    Args:
        image (np.ndarray): image to be processed, loaded with skimage.io.imread
        threshold (float): pixel value treated as background. Defaults to 0.0
            (pure black), which preserves the original behavior.

    Returns:
        np.ndarray: a new array with background pixels replaced.
    """
    return np.where(image == threshold, np.max(image), image)
def make_dataset(image, size=48):
    """Turn one grayscale image into a flat feature vector.

    Steps:
      1. Replace the black background with the image maximum. This is done
         before resizing so the interpolation made by ``resize`` does not
         smear dark borders into the output.
      2. Resize the image to (size, size) with anti-aliasing.
      3. Flatten the result into a 1-D vector usable as model input.

    Args:
        image (np.ndarray): image to be processed loaded with skimage.io.imread
        size (int): desired side length; the output holds size * size features
    """
    no_bg = remove_background(image)
    resized = resize(no_bg, (size, size), anti_aliasing=True)
    return np.asarray(resized).flatten()
# First preprocessing pass: remove the black background and resize every
# grayscale image to a common shape so they can be fed to the models.
with concurrent.futures.ProcessPoolExecutor() as executor:
    u, p = (
        np.asarray(list(executor.map(make_dataset, imgs)))
        for imgs in (uninf_imgs_bw, para_imgs_bw)
    )
# Free the raw image lists and bookkeeping variables: the flattened feature
# matrices `u` and `p` are all the rest of the notebook needs, and the image
# lists hold a large amount of memory.
del (
    datapath,
    label,
    labels,
    uninf_path,
    para_path,
    uninf_imgs,
    para_imgs,
    uninf_imgs_bw,
    para_imgs_bw,
    ax,
    fig,
    i,
    executor,
)
gc.collect()
# Stack the two classes into a single feature matrix: uninfected samples are
# labeled 0 and parasitized samples 1, in the same row order.
X = np.concatenate((u, p))
y = np.concatenate((np.zeros(len(u)), np.ones(len(p)))).reshape(-1, 1)
# Build a dataframe with one column per pixel plus a "labels" column;
# join() aligns on the (identical) default row index.
df = pd.DataFrame(data=X, columns=range(np.shape(X)[1])).join(
    pd.DataFrame(data=y, columns=["labels"])
)
# df.head()
# store the dataframe in a csv file
# df.to_csv(os.path.join(root,"data.csv"), index=False)
Simple check on the distribution of the labels in the training and test sets. This will lead to see that our dataset is balanced.
# Visualize the class balance and report it explicitly.
plt.figure(figsize=(5, 2), dpi=100)
plt.title("Number of infected (1) and uninfected (0) cells")
sns.countplot(y=df["labels"], palette="Set2")
vc = df["labels"].value_counts()
print("Is the dataset balanced? ", "Yes" if vc[0] == vc[1] else "No")
Is the dataset balanced? Yes
Split the dataset into Training and Test sets
During the training phase we will use K-Fold Cross Validation to evaluate the model and tune its hyperparameters.
from sklearn.model_selection import train_test_split

# Hold out 30% of the data for final testing; the remaining 70% is used for
# training and K-Fold cross-validation.
test_ratio = 0.3
# Stratify on the labels so both splits keep the 50/50 class balance;
# the fixed random_state makes the split reproducible.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    df.drop(columns=["labels"]),
    df["labels"],
    test_size=test_ratio,
    shuffle=True,
    stratify=y,
    random_state=42,
)
Using images, our dataset has a lot of features, so we will try to understand if model reduction techniques can help us. We will use PCA to reduce the dimensionality because we have only 2 classes and a supervised model is not necessary.
If the explained variance is not enough to explain the data, we need to use a more complex model. Probably a neural network.
About LDA: we don't use LDA as a dimensionality reduction method because with just 2 classes the output of the reduction will have only 1 dimension.
PCA
# Fit PCA keeping just enough components to explain 90% of the variance,
# then plot the cumulative explained-variance curve.
from sklearn.decomposition import PCA

pca = PCA(n_components=0.9)
pca.fit(X_train_val)
cum_var = pca.explained_variance_ratio_.cumsum()
n_components = pca.components_.shape[0]
fig = plt.figure(figsize=(15, 5))
plt.suptitle("Cumulative Explained variance Ratio")
plt.xlabel(f"Number of components - 90% at {n_components}")
plt.ylabel("Variance(%)")
plt.plot(range(len(cum_var)), cum_var, "o-", color="orange")
plt.axhline(y=0.9, color="black", linestyle="--")
plt.axvline(x=n_components, color="black", linestyle="--")
plt.grid()
print(f"Explained variance ratio: {cum_var[-1]:.2f}")
del pca, fig
gc.collect()
Explained variance ratio: 0.90
10
Analysis: Using 50 components we can explain 90% of the variance, so we can use this number as a threshold to reduce the dimensionality of the dataset from 2304 to 50.
Scaling the dataset
We will scale the dataset with two different techniques that produced better results during the analysis: StandardScaler and RobustScaler. The former will scale the data to have zero mean and unit variance, while the latter will set the mean to zero (and unit variance if enabled) according to the interquartile (25-75%) range.
Note: the robust scaler is designed to reduce the effect of outliers in the data. If our model performs better with the robust scaler, that suggests outliers are present in the data.
# Fit the two candidate scalers on the training split and transform it.
from sklearn.preprocessing import StandardScaler, RobustScaler

robust_scale = RobustScaler().fit(X_train_val)
std_scale = StandardScaler().fit(X_train_val)
x_robust = robust_scale.transform(X_train_val)
x_std = std_scale.transform(X_train_val)
# Show one sample image before and after each scaling as a visual sanity check.
plt.figure(figsize=(15, 5), dpi=100)
plt.suptitle("Original vs Robust vs Standard Scaled Images")
panels = (
    ("Original", X_train_val.iloc[1500].values),
    ("Robust", x_robust[1500]),
    ("Standard", x_std[1500]),
)
for pos, (title, pixels) in enumerate(panels, start=1):
    plt.subplot(1, 3, pos)
    plt.title(title)
    plt.imshow(pixels.reshape(48, 48), cmap="gray")
<matplotlib.image.AxesImage at 0x7f1860a044f0>
We will use simple models to test the dataset and to understand if we can get a good accuracy with a simple model. Then we will try to optimize the best scoring model searching for the best hyperparameters.
Note: we will show Confusion Matrix, Precision-Recall, and ROC Curve for better combination of each model.
from sklearn.metrics import (
RocCurveDisplay,
confusion_matrix,
ConfusionMatrixDisplay,
PrecisionRecallDisplay,
RocCurveDisplay,
precision_recall_curve,
roc_curve,
classification_report,
accuracy_score,
f1_score,
roc_auc_score,
recall_score,
precision_score,
)
def get_scores(model, X_train, y_train, X_test, y_test, display=False, name=None):
    """Fit *model* on the training split and score it on the test split.

    Computes the normalized confusion matrix, accuracy, macro F1, the
    precision-recall curve and the ROC curve (with AUC). When ``display`` is
    True it also plots the confusion matrix, precision-recall curve and ROC
    curve side by side. The order of values in the confusion matrix is:
    [TN, FP, FN, TP].

    - High precision: few negatives are mislabeled as positive.
    - High recall: most of the actual positives are found.
    - The larger the area under the ROC curve, the better the separation
      between the two classes.

    Args:
        model (sklearn estimator): model to be evaluated. Must implement fit
            and predict; predict_proba / decision_function are used for the
            curve-based metrics when available.
        X_train (np.ndarray): training data
        y_train (np.ndarray): training labels
        X_test (np.ndarray): test data
        y_test (np.ndarray): test labels
        display (bool): if True, draw the three diagnostic plots.
        name (str): title used in the plots; required when display is True.

    Returns:
        tuple: (pd.DataFrame report with per-class metrics plus a final
        "ROC" row carrying the AUC, normalized confusion matrix).
    """
    if display:
        assert name is not None, "You must provide a name for the plot"
    fitted = model.fit(X_train, y_train)
    y_pred = fitted.predict(X_test)
    # Class probabilities when the estimator supports them; otherwise fall
    # back to the hard predictions.
    y_proba = (
        fitted.predict_proba(X_test)
        if hasattr(fitted, "predict_proba")
        else fitted.predict(X_test)
    )
    # Continuous score for the ranking-based curves: prefer
    # decision_function, otherwise the positive-class probability column.
    y_score = (
        fitted.decision_function(X_test)
        if hasattr(fitted, "decision_function")
        else y_proba[:, 1]
    )
    cm = confusion_matrix(y_test, y_pred, labels=fitted.classes_, normalize="true")
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    # ROC / AUC use the positive-class probability when available.
    fpr, tpr, _ = roc_curve(y_test, y_proba[:, 1] if hasattr(fitted, "predict_proba") else y_proba)
    auc = roc_auc_score(y_test, y_proba[:, 1] if hasattr(fitted, "predict_proba") else y_proba)
    if display:
        fig, ax = plt.subplots(1, 3, figsize=(20, 5))
        # NOTE(review): precision/recall/tpr/fpr here are whole curves, so
        # .mean() in the title summarizes each curve, not a single
        # operating point — confirm this is the intended headline metric.
        fig.suptitle(
            f"Model: {name} - Accuracy: {acc.mean():.2f} - F1: {f1.mean():.2f} - Recall: {recall.mean():.2f} - Precision: {precision.mean():.2f} - TPR: {tpr.mean():.2f} - FPR: {fpr.mean():.2f}"
        )
        cm_display = ConfusionMatrixDisplay(
            confusion_matrix=cm, display_labels=fitted.classes_
        ).plot(ax=ax[0], colorbar=False)
        cm_display.ax_.set_title("Confusion Matrix (Normalized)")
        cm_display.ax_.set_ylabel("True label")
        cm_display.ax_.set_xlabel("Predicted label")
        pre_rec_display = PrecisionRecallDisplay(
            precision=precision, recall=recall
        ).plot(ax=ax[1])
        pre_rec_display.ax_.set_title("Precision-Recall Curve")
        pre_rec_display.ax_.set_ylabel("Precision")
        pre_rec_display.ax_.set_xlabel("Recall")
        roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax[2])
        roc_display.ax_.set_title("ROC Curve - AUC: {:.3f}".format(auc))
        roc_display.ax_.set_ylabel(f"True Positive Rate")
        roc_display.ax_.set_xlabel(f"False Positive Rate")
    # Build the tabular report: one row per class, accuracy, macro/weighted
    # averages, plus a synthetic "ROC" row holding the AUC.
    scores = classification_report(
        y_test, y_pred, labels=fitted.classes_, output_dict=True
    )
    # classification_report keys the classes as "0.0" / "1.0" strings.
    keys = list(scores[str(float("0"))]) + ["AUC"]
    index = list(scores.keys()) + ["ROC"]
    df = pd.DataFrame(index=index, columns=keys)
    for k in list(index):
        if k == str(float("0")) or k == str(float("1")):
            df.loc[k] = list(scores[k].values()) + [np.NaN]
        elif k == "accuracy":
            # accuracy is a scalar in the report dict; pad the other columns.
            df.loc[k] = [np.NaN, np.NaN, scores[k], X_test.shape[0]] + [np.NaN]
        elif k == "macro avg" or k == "weighted avg":
            df.loc[k] = list(scores[k].values()) + [np.NaN]
        else:
            # the synthetic "ROC" row carries only the AUC value.
            df.loc[k] = [np.NaN, np.NaN, np.NaN, np.NaN, auc]
    return df, cm
"""Pipelines"""
# We will use pipelines to make the testing workflow easier
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
RandomForestClassifier,
AdaBoostClassifier,
HistGradientBoostingClassifier,
)
# Define the candidate models. Parameters were selected looking at the
# documentation and the results of some tests (no optimization at the moment).
# Linear model trained with stochastic gradient descent; modified_huber loss
# makes it probability-capable, elasticnet mixes L1/L2 regularization.
sgdc = SGDClassifier(
    loss="modified_huber", penalty="elasticnet", max_iter=10000, n_jobs=-1
)
# Linear SVM with strong regularization (small C); primal form (dual=False).
lsvc = LinearSVC(C=1e-4, dual=False, max_iter=10000)
# Logistic regression with the saga solver, which scales to large datasets.
lr = LogisticRegression(C=1e-4, n_jobs=-1, solver="saga", max_iter=10000, dual=False)
# Majority vote among the 3 nearest neighbors.
knc = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
# Bagged ensemble of 500 depth-limited decision trees.
rfc = RandomForestClassifier(
    n_jobs=-1,
    n_estimators=500,
    max_depth=40,
    min_samples_split=3,
    min_samples_leaf=6,
    criterion="entropy",
    bootstrap=True,
)
# Boosted ensemble of shallow, pruned decision trees as weak learners.
ada = AdaBoostClassifier(
    n_estimators=500,
    learning_rate=0.1,
    estimator=DecisionTreeClassifier(
        max_depth=5, min_samples_leaf=6, max_features="log2", ccp_alpha=0.001
    ),
)
# Histogram-based gradient boosting with early stopping.
hgb = HistGradientBoostingClassifier(
    learning_rate=0.05, max_iter=300, early_stopping=True
)
# Define pipelines: each pairs a scaler (RobustScaler or StandardScaler),
# optionally a PCA step keeping 90% of the variance, and one estimator.
# The SGDC pipelines add a Nystroem kernel approximation so the linear SGD
# classifier can learn a non-linear boundary.
robust_pipe_SGDC = make_pipeline(
    RobustScaler(), Nystroem(n_components=1000, n_jobs=-1), sgdc
)
robust_pipe_LSVC = make_pipeline(RobustScaler(), lsvc)
robust_pipe_LR = make_pipeline(RobustScaler(), lr)
robust_pipe_KNN = make_pipeline(RobustScaler(), knc)
robust_pipe_RF = make_pipeline(RobustScaler(), rfc)
robust_pipe_ADA = make_pipeline(RobustScaler(), ada)
robust_pipe_HGB = make_pipeline(RobustScaler(), hgb)
# define std pipelines
std_pipe_SGDC = make_pipeline(
    StandardScaler(), Nystroem(n_components=1000, n_jobs=-1), sgdc
)
std_pipe_LSVC = make_pipeline(StandardScaler(), lsvc)
std_pipe_LR = make_pipeline(StandardScaler(), lr)
std_pipe_KNN = make_pipeline(StandardScaler(), knc)
std_pipe_RF = make_pipeline(StandardScaler(), rfc)
std_pipe_ADA = make_pipeline(StandardScaler(), ada)
std_pipe_HGB = make_pipeline(StandardScaler(), hgb)
# define robust pipelines with PCA (applied after the kernel map for SGDC)
r_pca_pipe_SGDC = make_pipeline(
    RobustScaler(), Nystroem(n_components=1000, n_jobs=-1), PCA(n_components=0.9), sgdc
)
r_pca_pipe_LSVC = make_pipeline(RobustScaler(), PCA(n_components=0.9), lsvc)
r_pca_pipe_LR = make_pipeline(RobustScaler(), PCA(n_components=0.9), lr)
r_pca_pipe_KNN = make_pipeline(RobustScaler(), PCA(n_components=0.9), knc)
r_pca_pipe_RF = make_pipeline(RobustScaler(), PCA(n_components=0.9), rfc)
r_pca_pipe_ADA = make_pipeline(RobustScaler(), PCA(n_components=0.9), ada)
r_pca_pipe_HGB = make_pipeline(RobustScaler(), PCA(n_components=0.9), hgb)
# define std pipelines with PCA
s_pca_pipe_LSVC = make_pipeline(StandardScaler(), PCA(n_components=0.9), lsvc)
s_pca_pipe_LR = make_pipeline(StandardScaler(), PCA(n_components=0.9), lr)
s_pca_pipe_KNN = make_pipeline(StandardScaler(), PCA(n_components=0.9), knc)
s_pca_pipe_RF = make_pipeline(StandardScaler(), PCA(n_components=0.9), rfc)
s_pca_pipe_ADA = make_pipeline(StandardScaler(), PCA(n_components=0.9), ada)
s_pca_pipe_HGB = make_pipeline(StandardScaler(), PCA(n_components=0.9), hgb)
s_pca_pipe_SGDC = make_pipeline(
    StandardScaler(),
    Nystroem(n_components=1000, n_jobs=-1),
    PCA(n_components=0.9),
    sgdc,
)
Stochastic Gradient Descent
Classifier that implements the Stochastic Gradient Descent algorithm. SGD
# SGDC with RobustScaler + Nystroem kernel approximation (no PCA).
r_score_sgd, r_cm_sgd = get_scores(
    robust_pipe_SGDC,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="SGDC Robust",
)
r_score_sgd  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.82062 | 0.903 | 0.859841 | 4134 | NaN |
| 1.0 | 0.892175 | 0.802612 | 0.845027 | 4134 | NaN |
| accuracy | NaN | NaN | 0.852806 | 8268 | NaN |
| macro avg | 0.856398 | 0.852806 | 0.852434 | 8268 | NaN |
| weighted avg | 0.856398 | 0.852806 | 0.852434 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.925115 |
# SGDC with RobustScaler + Nystroem, reduced with PCA (90% variance).
rp_score_sgd, rp_cm_sgd = get_scores(
    r_pca_pipe_SGDC,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="SGDC Robust with PCA",
)
rp_score_sgd  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.775967 | 0.863812 | 0.817537 | 4134 | NaN |
| 1.0 | 0.846427 | 0.750605 | 0.795641 | 4134 | NaN |
| accuracy | NaN | NaN | 0.807209 | 8268 | NaN |
| macro avg | 0.811197 | 0.807209 | 0.806589 | 8268 | NaN |
| weighted avg | 0.811197 | 0.807209 | 0.806589 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.883602 |
# SGDC with StandardScaler + Nystroem kernel approximation (no PCA).
s_score_sgd, s_cm_sgd = get_scores(
    std_pipe_SGDC,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="SGDC Standard",
)
s_score_sgd  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.829056 | 0.898645 | 0.862449 | 4134 | NaN |
| 1.0 | 0.889358 | 0.814707 | 0.850398 | 4134 | NaN |
| accuracy | NaN | NaN | 0.856676 | 8268 | NaN |
| macro avg | 0.859207 | 0.856676 | 0.856423 | 8268 | NaN |
| weighted avg | 0.859207 | 0.856676 | 0.856423 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.926266 |
# SGDC with StandardScaler + Nystroem, reduced with PCA (90% variance).
sp_score_sgd, sp_cm_sgd = get_scores(
    s_pca_pipe_SGDC,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="SGDC Standard with PCA",
)
sp_score_sgd  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.830415 | 0.837446 | 0.833915 | 4134 | NaN |
| 1.0 | 0.836058 | 0.828979 | 0.832503 | 4134 | NaN |
| accuracy | NaN | NaN | 0.833212 | 8268 | NaN |
| macro avg | 0.833236 | 0.833212 | 0.833209 | 8268 | NaN |
| weighted avg | 0.833236 | 0.833212 | 0.833209 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.894365 |
Linear Support Vector Classification
Supervised learning model for classification. It's an optimized implementation of the Support Vector Machine algorithm for the case of a linear kernel. It is useful in high dimensional spaces like images.
# Linear SVC with RobustScaler (no PCA).
r_score_lsvc, r_cm_lsvc = get_scores(
    robust_pipe_LSVC,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="LSVC Robust",
)
r_score_lsvc  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.675772 | 0.736091 | 0.704643 | 4134 | NaN |
| 1.0 | 0.710226 | 0.646831 | 0.677048 | 4134 | NaN |
| accuracy | NaN | NaN | 0.691461 | 8268 | NaN |
| macro avg | 0.692999 | 0.691461 | 0.690845 | 8268 | NaN |
| weighted avg | 0.692999 | 0.691461 | 0.690845 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.691461 |
# Linear SVC with RobustScaler + PCA (90% variance).
rp_score_lsvc, rp_cm_lsvc = get_scores(
    r_pca_pipe_LSVC,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="LSVC Robust with PCA",
)
rp_score_lsvc  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.662748 | 0.736333 | 0.697605 | 4134 | NaN |
| 1.0 | 0.703401 | 0.625302 | 0.662057 | 4134 | NaN |
| accuracy | NaN | NaN | 0.680818 | 8268 | NaN |
| macro avg | 0.683075 | 0.680818 | 0.679831 | 8268 | NaN |
| weighted avg | 0.683075 | 0.680818 | 0.679831 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.680818 |
# Linear SVC with StandardScaler (no PCA).
s_score_lsvc, s_cm_lsvc = get_scores(
    std_pipe_LSVC,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="LSVC Standard",
)
s_score_lsvc  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.673197 | 0.75641 | 0.712382 | 4134 | NaN |
| 1.0 | 0.722054 | 0.632801 | 0.674488 | 4134 | NaN |
| accuracy | NaN | NaN | 0.694606 | 8268 | NaN |
| macro avg | 0.697625 | 0.694606 | 0.693435 | 8268 | NaN |
| weighted avg | 0.697625 | 0.694606 | 0.693435 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.694606 |
# Linear SVC with StandardScaler + PCA (90% variance).
sp_score_lsvc, sp_cm_lsvc = get_scores(
    s_pca_pipe_LSVC,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="LSVC Standard with PCA",
)
sp_score_lsvc  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.665283 | 0.736575 | 0.699116 | 4134 | NaN |
| 1.0 | 0.704958 | 0.629415 | 0.665048 | 4134 | NaN |
| accuracy | NaN | NaN | 0.682995 | 8268 | NaN |
| macro avg | 0.68512 | 0.682995 | 0.682082 | 8268 | NaN |
| weighted avg | 0.68512 | 0.682995 | 0.682082 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.682995 |
Logistic Regression
Supervised learning algorithm used to predict a binary outcome (1 / 0, Yes / No, True / False) given a set of independent variables. Despite its name, it is a linear model for classification rather than regression: it estimates the probability of a certain class or event existing, such as pass/fail, win/lose, alive/dead or healthy/sick.
# Logistic regression with RobustScaler (no PCA).
r_score_lr, r_cm_lr = get_scores(
    robust_pipe_LR,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="LR Robust",
)
r_score_lr  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.669401 | 0.730285 | 0.698519 | 4134 | NaN |
| 1.0 | 0.7033 | 0.639332 | 0.669792 | 4134 | NaN |
| accuracy | NaN | NaN | 0.684809 | 8268 | NaN |
| macro avg | 0.68635 | 0.684809 | 0.684156 | 8268 | NaN |
| weighted avg | 0.68635 | 0.684809 | 0.684156 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.74248 |
# Logistic regression with RobustScaler + PCA (90% variance).
rp_score_lr, rp_cm_lr = get_scores(
    r_pca_pipe_LR,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="LR Robust with PCA",
)
rp_score_lr  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.663279 | 0.726173 | 0.693303 | 4134 | NaN |
| 1.0 | 0.697488 | 0.63135 | 0.662773 | 4134 | NaN |
| accuracy | NaN | NaN | 0.678761 | 8268 | NaN |
| macro avg | 0.680383 | 0.678761 | 0.678038 | 8268 | NaN |
| weighted avg | 0.680383 | 0.678761 | 0.678038 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.73213 |
# Logistic regression with StandardScaler (no PCA).
s_score_lr, s_cm_lr = get_scores(
    std_pipe_LR,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="LR Standard",
)
s_score_lr  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.67006 | 0.732463 | 0.699873 | 4134 | NaN |
| 1.0 | 0.704988 | 0.639332 | 0.670557 | 4134 | NaN |
| accuracy | NaN | NaN | 0.685897 | 8268 | NaN |
| macro avg | 0.687524 | 0.685897 | 0.685215 | 8268 | NaN |
| weighted avg | 0.687524 | 0.685897 | 0.685215 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.743441 |
# Logistic regression with StandardScaler + PCA (90% variance).
sp_score_lr, sp_cm_lr = get_scores(
    s_pca_pipe_LR,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="LR Standard with PCA",
)
sp_score_lr  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.664014 | 0.726657 | 0.693925 | 4134 | NaN |
| 1.0 | 0.698184 | 0.632317 | 0.66362 | 4134 | NaN |
| accuracy | NaN | NaN | 0.679487 | 8268 | NaN |
| macro avg | 0.681099 | 0.679487 | 0.678772 | 8268 | NaN |
| weighted avg | 0.681099 | 0.679487 | 0.678772 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.732451 |
K-Nearest Neighbors
Supervised learning algorithm based on the k-nearest neighbors principle. It does not attempt to construct a general internal model, but simply stores instances of the training data. Classification is computed from a simple majority vote of the nearest neighbors of each point: a query point is assigned the data class which has the most representatives within the nearest neighbors of the point.
# K-nearest neighbors with RobustScaler (no PCA).
r_score_knn, r_cm_knn = get_scores(
    robust_pipe_KNN,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="KNN Robust",
)
r_score_knn  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.538959 | 0.987179 | 0.697249 | 4134 | NaN |
| 1.0 | 0.923851 | 0.155539 | 0.266253 | 4134 | NaN |
| accuracy | NaN | NaN | 0.571359 | 8268 | NaN |
| macro avg | 0.731405 | 0.571359 | 0.481751 | 8268 | NaN |
| weighted avg | 0.731405 | 0.571359 | 0.481751 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.672756 |
# K-nearest neighbors with RobustScaler + PCA (90% variance).
rp_score_knn, rp_cm_knn = get_scores(
    r_pca_pipe_KNN,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="KNN Robust with PCA",
)
rp_score_knn  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.591583 | 0.945331 | 0.727747 | 4134 | NaN |
| 1.0 | 0.864019 | 0.347363 | 0.495514 | 4134 | NaN |
| accuracy | NaN | NaN | 0.646347 | 8268 | NaN |
| macro avg | 0.727801 | 0.646347 | 0.61163 | 8268 | NaN |
| weighted avg | 0.727801 | 0.646347 | 0.61163 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.752117 |
# K-nearest neighbors with StandardScaler (no PCA).
s_score_knn, s_cm_knn = get_scores(
    std_pipe_KNN,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="KNN Standard",
)
s_score_knn  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.536971 | 0.985486 | 0.695163 | 4134 | NaN |
| 1.0 | 0.911894 | 0.150218 | 0.257944 | 4134 | NaN |
| accuracy | NaN | NaN | 0.567852 | 8268 | NaN |
| macro avg | 0.724433 | 0.567852 | 0.476553 | 8268 | NaN |
| weighted avg | 0.724433 | 0.567852 | 0.476553 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.670381 |
# K-nearest neighbors with StandardScaler + PCA (90% variance).
sp_score_knn, sp_cm_knn = get_scores(
    s_pca_pipe_KNN,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="KNN Standard with PCA",
)
sp_score_knn  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.5888 | 0.948718 | 0.726633 | 4134 | NaN |
| 1.0 | 0.868077 | 0.337446 | 0.485978 | 4134 | NaN |
| accuracy | NaN | NaN | 0.643082 | 8268 | NaN |
| macro avg | 0.728439 | 0.643082 | 0.606305 | 8268 | NaN |
| weighted avg | 0.728439 | 0.643082 | 0.606305 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.750459 |
Random Forest
Tree-based model that uses a number of decision trees to predict the outcome of a data point with the purpose of reducing the error rate of the model trading off variance for bias to reduce the overfitting.
# Random forest with RobustScaler (no PCA).
r_score_rf, r_cm_rf = get_scores(
    robust_pipe_RF,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="RF Robust",
)
r_score_rf  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.834585 | 0.833575 | 0.83408 | 4134 | NaN |
| 1.0 | 0.833776 | 0.834785 | 0.83428 | 4134 | NaN |
| accuracy | NaN | NaN | 0.83418 | 8268 | NaN |
| macro avg | 0.83418 | 0.83418 | 0.83418 | 8268 | NaN |
| weighted avg | 0.83418 | 0.83418 | 0.83418 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.917757 |
# Random forest with RobustScaler + PCA (90% variance).
rp_score_rf, rp_cm_rf = get_scores(
    r_pca_pipe_RF,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="RF Robust with PCA",
)
rp_score_rf  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.923014 | 0.814949 | 0.865622 | 4134 | NaN |
| 1.0 | 0.834344 | 0.932027 | 0.880484 | 4134 | NaN |
| accuracy | NaN | NaN | 0.873488 | 8268 | NaN |
| macro avg | 0.878679 | 0.873488 | 0.873053 | 8268 | NaN |
| weighted avg | 0.878679 | 0.873488 | 0.873053 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.94451 |
# Random forest with StandardScaler (no PCA).
s_score_rf, s_cm_rf = get_scores(
    std_pipe_RF,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="RF Standard",
)
s_score_rf  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.833979 | 0.833575 | 0.833777 | 4134 | NaN |
| 1.0 | 0.833656 | 0.834059 | 0.833857 | 4134 | NaN |
| accuracy | NaN | NaN | 0.833817 | 8268 | NaN |
| macro avg | 0.833817 | 0.833817 | 0.833817 | 8268 | NaN |
| weighted avg | 0.833817 | 0.833817 | 0.833817 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.917297 |
# Random forest with StandardScaler + PCA (90% variance).
sp_score_rf, sp_cm_rf = get_scores(
    s_pca_pipe_RF,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="RF Standard with PCA",
)
sp_score_rf  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.927368 | 0.812288 | 0.866022 | 4134 | NaN |
| 1.0 | 0.833011 | 0.936381 | 0.881676 | 4134 | NaN |
| accuracy | NaN | NaN | 0.874335 | 8268 | NaN |
| macro avg | 0.880189 | 0.874335 | 0.873849 | 8268 | NaN |
| weighted avg | 0.880189 | 0.874335 | 0.873849 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.947669 |
AdaBoost
Model that fits a sequence of weak learners (models that are slightly better than random guessing) on repeatedly modified versions of the data. The predictions from all of them are then combined through a weighted majority vote (or sum) to produce the final prediction.
# AdaBoost with RobustScaler (no PCA).
r_score_ada, r_cm_ada = get_scores(
    robust_pipe_ADA,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="ADA Robust",
)
r_score_ada  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.806314 | 0.957668 | 0.875498 | 4134 | NaN |
| 1.0 | 0.947886 | 0.769956 | 0.849706 | 4134 | NaN |
| accuracy | NaN | NaN | 0.863812 | 8268 | NaN |
| macro avg | 0.8771 | 0.863812 | 0.862602 | 8268 | NaN |
| weighted avg | 0.8771 | 0.863812 | 0.862602 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.950024 |
# AdaBoost with RobustScaler + PCA (90% variance).
rp_score_ada, rp_cm_ada = get_scores(
    r_pca_pipe_ADA,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="ADA Robust with PCA",
)
rp_score_ada  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.890813 | 0.858491 | 0.874353 | 4134 | NaN |
| 1.0 | 0.863445 | 0.894775 | 0.878831 | 4134 | NaN |
| accuracy | NaN | NaN | 0.876633 | 8268 | NaN |
| macro avg | 0.877129 | 0.876633 | 0.876592 | 8268 | NaN |
| weighted avg | 0.877129 | 0.876633 | 0.876592 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.943245 |
# AdaBoost with StandardScaler (no PCA).
s_score_ada, s_cm_ada = get_scores(
    std_pipe_ADA,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="ADA Standard",
)
s_score_ada  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.803222 | 0.95283 | 0.871653 | 4134 | NaN |
| 1.0 | 0.942033 | 0.76657 | 0.845292 | 4134 | NaN |
| accuracy | NaN | NaN | 0.8597 | 8268 | NaN |
| macro avg | 0.872628 | 0.8597 | 0.858473 | 8268 | NaN |
| weighted avg | 0.872628 | 0.8597 | 0.858473 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.945569 |
# AdaBoost with StandardScaler + PCA (90% variance).
sp_score_ada, sp_cm_ada = get_scores(
    s_pca_pipe_ADA,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="ADA Standard with PCA",
)
sp_score_ada  # show the per-class score table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.892688 | 0.865264 | 0.878762 | 4134 | NaN |
| 1.0 | 0.86928 | 0.895985 | 0.88243 | 4134 | NaN |
| accuracy | NaN | NaN | 0.880624 | 8268 | NaN |
| macro avg | 0.880984 | 0.880624 | 0.880596 | 8268 | NaN |
| weighted avg | 0.880984 | 0.880624 | 0.880596 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.94585 |
Hist Gradient (tree) Boosting
An optimized implementation of gradient-boosted decision trees. This estimator first bins the input samples into integer-valued bins, reducing the number of candidate splits considered when building the trees. The speed-up comes from the fact that the algorithm uses integer-based data structures to represent the binned data, which is faster than using the original floating-point values.
# Evaluate HistGradientBoosting with RobustScaler preprocessing on the test set.
r_score_hgb, r_cm_hgb = get_scores(
    robust_pipe_HGB,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="HGB Robust",
)
r_score_hgb  # cell output: show the metrics table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.887115 | 0.954282 | 0.919473 | 4134 | NaN |
| 1.0 | 0.950537 | 0.878568 | 0.913136 | 4134 | NaN |
| accuracy | NaN | NaN | 0.916425 | 8268 | NaN |
| macro avg | 0.918826 | 0.916425 | 0.916305 | 8268 | NaN |
| weighted avg | 0.918826 | 0.916425 | 0.916305 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.972771 |
# Evaluate HistGradientBoosting with RobustScaler + PCA preprocessing.
rp_score_hgb, rp_cm_hgb = get_scores(
    r_pca_pipe_HGB,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="HGB Robust with PCA",
)
rp_score_hgb  # cell output: show the metrics table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.912733 | 0.865264 | 0.888365 | 4134 | NaN |
| 1.0 | 0.871925 | 0.917271 | 0.894023 | 4134 | NaN |
| accuracy | NaN | NaN | 0.891268 | 8268 | NaN |
| macro avg | 0.892329 | 0.891268 | 0.891194 | 8268 | NaN |
| weighted avg | 0.892329 | 0.891268 | 0.891194 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.952724 |
# Evaluate HistGradientBoosting with StandardScaler preprocessing.
s_score_hgb, s_cm_hgb = get_scores(
    std_pipe_HGB,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="HGB Standard",
)
s_score_hgb  # cell output: show the metrics table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.889739 | 0.956459 | 0.921893 | 4134 | NaN |
| 1.0 | 0.952929 | 0.881471 | 0.915808 | 4134 | NaN |
| accuracy | NaN | NaN | 0.918965 | 8268 | NaN |
| macro avg | 0.921334 | 0.918965 | 0.918851 | 8268 | NaN |
| weighted avg | 0.921334 | 0.918965 | 0.918851 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.97313 |
# Evaluate HistGradientBoosting with StandardScaler + PCA preprocessing.
sp_score_hgb, sp_cm_hgb = get_scores(
    s_pca_pipe_HGB,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="HGB Standard with PCA",
)
sp_score_hgb  # cell output: show the metrics table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.911151 | 0.865747 | 0.887869 | 4134 | NaN |
| 1.0 | 0.87212 | 0.915578 | 0.893321 | 4134 | NaN |
| accuracy | NaN | NaN | 0.890663 | 8268 | NaN |
| macro avg | 0.891635 | 0.890663 | 0.890595 | 8268 | NaN |
| weighted avg | 0.891635 | 0.890663 | 0.890595 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.95634 |
Looking at the f1-score and accuracy of these models we can see that:
Looking at the ROC Curve we can see that a lot of models have a good performance, but the best one is the Histogram based Gradient Boosting classifier. It has a good AUC and a good Precision-Recall curve.
We will use the Histogram based Gradient Boosting classifier in the optimization phase.
Define the study and the objective function, then run the optimization.
# clean up memory
# Drop the large intermediates from the model-comparison phase (raw image
# arrays, fitted pipelines, score tables) before starting hyperparameter
# optimization, then force a garbage-collection pass. The commented-out
# names were presumably never bound in this run — TODO confirm which cells
# were executed before deleting them.
del (
    u,
    p,
    sgdc,
    lsvc,
    lr,
    knc,
    rfc,
    ada,
    x_robust,
    x_std,
    robust_pipe_SGDC,
    robust_pipe_LSVC,
    robust_pipe_LR,
    robust_pipe_KNN,
    robust_pipe_RF,
    robust_pipe_ADA,
    std_pipe_SGDC,
    std_pipe_LSVC,
    std_pipe_LR,
    std_pipe_KNN,
    std_pipe_RF,
    std_pipe_ADA,
    std_pipe_HGB,
    r_pca_pipe_SGDC,
    r_pca_pipe_LSVC,
    r_pca_pipe_LR,
    r_pca_pipe_KNN,
    r_pca_pipe_RF,
    r_pca_pipe_ADA,
    r_pca_pipe_HGB,
    s_pca_pipe_SGDC,
    s_pca_pipe_LSVC,
    s_pca_pipe_LR,
    s_pca_pipe_KNN,
    s_pca_pipe_RF,
    s_pca_pipe_ADA,
    s_pca_pipe_HGB,
    r_score_sgd,
    r_cm_sgd,
    # rp_score_sgd, rp_cm_sgd, s_score_sgd, s_cm_sgd, sp_score_sgd, sp_cm_sgd,
    # r_score_lsvc, r_cm_lsvc, rp_score_lsvc, rp_cm_lsvc,
    s_score_lsvc,
    s_cm_lsvc,
    # sp_score_lsvc, sp_cm_lsvc,
    # r_score_lr, r_cm_lr, rp_score_lr, rp_cm_lr,
    s_score_lr,
    s_cm_lr,
    # sp_score_lr, sp_cm_lr,
    # r_score_knn, r_cm_knn,
    rp_score_knn,
    rp_cm_knn,
    # s_score_knn, s_cm_knn, sp_score_knn, sp_cm_knn,
    # r_score_rf, r_cm_rf, rp_score_rf, rp_cm_rf, s_score_rf, s_cm_rf,
    sp_score_rf,
    sp_cm_rf,
    # r_score_ada, r_cm_ada,
    rp_score_ada,
    rp_cm_ada,
    # s_score_ada, s_cm_ada, sp_score_ada, sp_cm_ada,
    r_score_hgb,
    r_cm_hgb,
    # rp_score_hgb, rp_cm_hgb, s_score_hgb, s_cm_hgb, sp_score_hgb, sp_cm_hgb
)
gc.collect()  # cell output: number of unreachable objects collected
8780
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
import optuna
def objective(trial, X, y):
    """Optuna objective: mean 5-fold CV ROC-AUC for a scaled HGB classifier.

    Samples the classifier's hyperparameters from ``trial``, wraps the model
    in a RobustScaler pipeline, and scores it with stratified shuffled
    5-fold cross-validation. Higher is better (the study maximizes).
    """
    # Sample every hyperparameter of the search space from the trial.
    clf = HistGradientBoostingClassifier(
        learning_rate=trial.suggest_float("learning_rate", 0.001, 1, log=True),
        l2_regularization=trial.suggest_float("l2_regularization", 1e-10, 10, log=True),
        max_iter=trial.suggest_int("max_iter", 100, 500, step=20),
        max_leaf_nodes=trial.suggest_int("max_leaf_nodes", 20, 100),
        max_depth=trial.suggest_int("max_depth", 400, 500, step=10),
        max_bins=trial.suggest_int("max_bins", 100, 255),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 20, 30, step=2),
        early_stopping=trial.suggest_categorical("early_stopping", [True]),
    )
    # Stratified shuffled folds; scaling + model chained into one estimator.
    folds = StratifiedKFold(n_splits=5, shuffle=True)
    cv_scores = cross_val_score(
        make_pipeline(RobustScaler(), clf),
        X,
        y,
        scoring="roc_auc",
        n_jobs=-1,
        cv=folds,
    )
    # Aggregate the per-fold AUCs into a single objective value.
    return cv_scores.mean()
best_params = None  # filled by the optimization below, or by the fallback cell further down
# ! this takes a while to run
# J = lambda trial: objective(trial, X_train_val, y_train_val)
# study = optuna.create_study(direction="maximize", study_name="HGB", sampler=optuna.samplers.TPESampler(seed=42), pruner=optuna.pruners.MedianPruner(n_warmup_steps=5))
# study.optimize(J, n_trials=10, gc_after_trial=True)
# best_params = study.best_params
Best Param with 10 trials
One run of the previous cell gives this output:

We can get the best parameters obtained with 10 trials and plot the optimization history and the importance factor of each parameter we considered in the optimization. We can see that the most important parameters are the learning rate, the number of iterations and the number of bins. We could try to optimize these parameters with a bigger number of trials but the score is so high that further improvement is not necessary and it will take a lot of time.
Note: the parameters we found do not, in fact, improve the score much with respect to the ones used in the previous phase.
# Fall back to the parameters found by a previous 10-trial Optuna run when
# the (slow) optimization cell above was left commented out.
if best_params is None:
    best_params = {
        "learning_rate": 0.07068619051755831,
        "l2_regularization": 0.00022185517620643103,
        "max_iter": 260,
        "max_leaf_nodes": 80,
        "max_depth": 410,
        "max_bins": 127,
        "min_samples_leaf": 26,
        "early_stopping": True,
    }
# Final pipeline: robust scaling + tuned HistGradientBoosting classifier.
# NOTE(review): unit_variance=True differs from the plain RobustScaler()
# used inside the Optuna objective — confirm the mismatch is intentional.
model = make_pipeline(
    RobustScaler(unit_variance=True), HistGradientBoostingClassifier(**best_params)
)
# Fit on train+validation and evaluate on the held-out test set.
s, c = get_scores(
    model,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name="HGB Best Params",
)
s  # cell output: show the metrics table
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.910437 | 0.951621 | 0.930574 | 4134 | NaN |
| 1.0 | 0.949329 | 0.906386 | 0.92736 | 4134 | NaN |
| accuracy | NaN | NaN | 0.929003 | 8268 | NaN |
| macro avg | 0.929883 | 0.929003 | 0.928967 | 8268 | NaN |
| weighted avg | 0.929883 | 0.929003 | 0.928967 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.975804 |
We can see that the final model has a good performance: in 93% of the cases it predicts the correct class, and the ROC-AUC is 0.97 out of 1. The precision and recall curves are also very good, but in the context of medical diagnosis we would like a higher recall. In our case, on average, the model detects an infected cell with a 90% probability. This is a good result, but we can try some a-posteriori analysis to understand whether the recall on the infected class can be improved.
Try an altered classification threshold: treat the predicted probability of a class as a score and use a threshold to classify the cell as infected or not. We want to maximize the recall on the infected class without losing too much precision on the healthy class.
# Visualize how confident the tuned model is on the test set:
# a KDE of the positive-class probabilities on the left, and the raw
# histogram of the same values on the right.
proba = model.predict_proba(X_test)
fig, (dens_ax, hist_ax) = plt.subplots(1, 2, figsize=(10, 3))
sns.kdeplot(proba[:, 1], fill=True, ax=dens_ax)
dens_ax.set_title("Density of the predicted probabilities")
hist_ax.hist(proba[:, 1], bins=20)
hist_ax.set_title("Histogram of the predicted probabilities")
Text(0.5, 1.0, 'Histogram of the predicted probabilities')
In this density plot we can see that the model is confident about most of its predictions, with very few cases where it is uncertain about the class of a cell. Also, looking at the histogram of the probabilities, the values are very close to 0 or 1, so the threshold must be very aggressive to improve the recall.
# try to plot recall score vs lower bound
# we sweep the lower bound between 0.1 and 0.5: every probability in
# [bound, 0.5] is flipped (p -> 1 - p), pushing those borderline samples
# over to the positive (infected) class before thresholding at 0.5.
ub = 0.51  # NOTE(review): unused — the sweep's effective upper bound is 0.5
lb = np.linspace(0.1, 0.5, 10)
pred = proba[:, 1]
# ! change the average param to see how it affects the recall score
average = "macro"
# Baseline recall at the standard 0.5 threshold. Fix: use the same `average`
# as the sweep below — the original used the default 'binary' average here,
# so the dashed baseline was not comparable to the macro-averaged curve.
base_recall = recall_score(y_test, (pred > 0.5).astype(int), average=average)
recalls = [base_recall]
new_prob_list = np.zeros((len(lb), len(pred)))
for i, bound in enumerate(lb):  # enumerate replaces the O(n) np.where lookup
    new_prob = pred.copy()
    # flip every probability between the current lower bound and 0.5
    indexes = (pred >= bound) & (pred <= 0.5)
    new_prob[indexes] = 1 - pred[indexes]
    new_prob_list[i] = new_prob
    y_pred = (new_prob > 0.5).astype(int)
    recalls.append(recall_score(y_test, y_pred, average=average))
recall = np.asarray(recalls)  # recall[0] = baseline, recall[1:] = sweep values
_, ax = plt.subplots(1, 2, figsize=(10, 3))
ax[0].plot(lb, recall[1:], "o-")
ax[0].hlines(base_recall, min(lb), max(lb), linestyles="dashed")
ax[0].hlines(max(recall), min(lb), max(lb), linestyles="dashed", colors="red")
ax[0].legend(
    [
        f"Recall_{average}",
        "Base Recall",
        f"Max Recall at {lb[np.argmax(recall[1:])]:.3f}",
    ]
)
ax[0].set_title("Recall vs Lower Bound")
ax[1].hist(new_prob_list[np.argmax(recall[1:])], bins=20)
ax[1].set_title("Histogram for altered probabilities at max recall")
Text(0.5, 1.0, 'Histogram for altered probabilities at max recall')
Setting the threshold for the infected class lower than 0.5 leads to a higher recall, but it also means that the precision will slowly decrease. Optimizing the threshold can help us find a good trade-off.
# Define objective function for Optuna
def best_threshold(trial, pred, pred_proba, average):
    """Optuna objective: recall after flipping borderline probabilities.

    ``pred`` holds the true labels (the name is historical) and
    ``pred_proba`` the predicted class probabilities, column 1 being the
    positive class. Probabilities falling between the sampled lower bound
    and the fixed upper bound of 0.5 are replaced by their complement,
    which reassigns those borderline samples to the positive class.
    Returns the recall score (with the given averaging) to maximize.
    """
    # Sample only the lower bound; the upper bound is fixed at 0.5.
    lower_bound = trial.suggest_float("lower_bound", 0.1, 0.5, log=True)
    # Work on a copy so the caller's probabilities stay untouched.
    probs_t = pred_proba[:, 1].copy()
    # Flip every probability inside the [lower_bound, 0.5] band.
    borderline = (probs_t >= lower_bound) & (probs_t <= 0.5)
    probs_t[borderline] = 1 - probs_t[borderline]
    # Threshold at 0.5 and score the resulting hard predictions.
    predictions = (probs_t > 0.5).astype(int)
    return recall_score(y_true=pred, y_pred=predictions, average=average)
# Define the study: maximize the recall objective over the lower-bound
# threshold using the TPE sampler (150 trials).
recall_study = optuna.create_study(
    direction="maximize",
    study_name="recall",
    sampler=optuna.samplers.TPESampler(),
    pruner=optuna.pruners.MedianPruner(),
)
# Bind the fixed arguments (true labels, probabilities, averaging mode).
J = lambda trial: best_threshold(trial, y_test.values, proba, average)
recall_study.optimize(J, n_trials=150, gc_after_trial=True)
# optuna.visualization.matplotlib.plot_optimization_history(recall_study)
recall_study.best_params  # cell output: the best lower bound found
[I 2023-05-31 15:11:16,757] A new study created in memory with name: recall [I 2023-05-31 15:11:16,764] Trial 0 finished with value: 0.9326318335752298 and parameters: {'lower_bound': 0.44775917277038935}. Best is trial 0 with value: 0.9326318335752298. [I 2023-05-31 15:11:16,867] Trial 1 finished with value: 0.9340832123850992 and parameters: {'lower_bound': 0.28021177184627394}. Best is trial 1 with value: 0.9340832123850992. [I 2023-05-31 15:11:16,973] Trial 2 finished with value: 0.9052975326560232 and parameters: {'lower_bound': 0.13061942732621745}. Best is trial 1 with value: 0.9340832123850992. [I 2023-05-31 15:11:17,083] Trial 3 finished with value: 0.9288824383164005 and parameters: {'lower_bound': 0.2236104099050449}. Best is trial 1 with value: 0.9340832123850992. [I 2023-05-31 15:11:17,188] Trial 4 finished with value: 0.8986453797774553 and parameters: {'lower_bound': 0.11998112194844147}. Best is trial 1 with value: 0.9340832123850992. [I 2023-05-31 15:11:17,291] Trial 5 finished with value: 0.9263425253991292 and parameters: {'lower_bound': 0.2056882198812947}. Best is trial 1 with value: 0.9340832123850992. [I 2023-05-31 15:11:17,398] Trial 6 finished with value: 0.9150943396226414 and parameters: {'lower_bound': 0.16046124391266034}. Best is trial 1 with value: 0.9340832123850992. [I 2023-05-31 15:11:17,503] Trial 7 finished with value: 0.8981615868408321 and parameters: {'lower_bound': 0.11858202277208511}. Best is trial 1 with value: 0.9340832123850992. [I 2023-05-31 15:11:17,613] Trial 8 finished with value: 0.9290033865505564 and parameters: {'lower_bound': 0.49797460185620895}. Best is trial 1 with value: 0.9340832123850992. [I 2023-05-31 15:11:17,719] Trial 9 finished with value: 0.9339622641509434 and parameters: {'lower_bound': 0.2904039916401189}. Best is trial 1 with value: 0.9340832123850992. [I 2023-05-31 15:11:17,830] Trial 10 finished with value: 0.9351717464925011 and parameters: {'lower_bound': 0.3293954511412525}. 
Best is trial 10 with value: 0.9351717464925011. [I 2023-05-31 15:11:17,940] Trial 11 finished with value: 0.9351717464925011 and parameters: {'lower_bound': 0.3295182444088689}. Best is trial 10 with value: 0.9351717464925011. [I 2023-05-31 15:11:18,050] Trial 12 finished with value: 0.9342041606192549 and parameters: {'lower_bound': 0.35854659006478423}. Best is trial 10 with value: 0.9351717464925011. [I 2023-05-31 15:11:18,159] Trial 13 finished with value: 0.9343251088534108 and parameters: {'lower_bound': 0.3716380761097377}. Best is trial 10 with value: 0.9351717464925011. [I 2023-05-31 15:11:18,271] Trial 14 finished with value: 0.9345670053217223 and parameters: {'lower_bound': 0.316185165123328}. Best is trial 10 with value: 0.9351717464925011. [I 2023-05-31 15:11:18,445] Trial 15 finished with value: 0.9340832123850992 and parameters: {'lower_bound': 0.4082937849912237}. Best is trial 10 with value: 0.9351717464925011. [I 2023-05-31 15:11:18,590] Trial 16 finished with value: 0.9316642477019835 and parameters: {'lower_bound': 0.2517486976464147}. Best is trial 10 with value: 0.9351717464925011. [I 2023-05-31 15:11:18,703] Trial 17 finished with value: 0.9351717464925012 and parameters: {'lower_bound': 0.34411330906864557}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:18,812] Trial 18 finished with value: 0.9337203676826318 and parameters: {'lower_bound': 0.3962950945097248}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:18,919] Trial 19 finished with value: 0.9351717464925011 and parameters: {'lower_bound': 0.3298327497798462}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:19,027] Trial 20 finished with value: 0.932510885341074 and parameters: {'lower_bound': 0.26445711826524243}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:19,139] Trial 21 finished with value: 0.9350507982583454 and parameters: {'lower_bound': 0.33192679962193145}. 
Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:19,252] Trial 22 finished with value: 0.9329946782776972 and parameters: {'lower_bound': 0.42112854496986457}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:19,362] Trial 23 finished with value: 0.9350507982583454 and parameters: {'lower_bound': 0.3529101506312646}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:19,472] Trial 24 finished with value: 0.9339622641509434 and parameters: {'lower_bound': 0.30123036294218114}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:19,587] Trial 25 finished with value: 0.9340832123850992 and parameters: {'lower_bound': 0.37270291642640824}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:19,697] Trial 26 finished with value: 0.9310595065312046 and parameters: {'lower_bound': 0.4633518215147294}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:19,806] Trial 27 finished with value: 0.9345670053217223 and parameters: {'lower_bound': 0.3139737521054523}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:19,916] Trial 28 finished with value: 0.9317851959361394 and parameters: {'lower_bound': 0.24877807544520936}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:20,024] Trial 29 finished with value: 0.933115626511853 and parameters: {'lower_bound': 0.4365739891736017}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:20,134] Trial 30 finished with value: 0.9337203676826318 and parameters: {'lower_bound': 0.4019751697274758}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:20,242] Trial 31 finished with value: 0.9349298500241896 and parameters: {'lower_bound': 0.3395221547156287}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:20,352] Trial 32 finished with value: 0.933599419448476 and parameters: {'lower_bound': 0.2883553574134663}. 
Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:20,462] Trial 33 finished with value: 0.9350507982583454 and parameters: {'lower_bound': 0.3390280001930688}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:20,578] Trial 34 finished with value: 0.9332365747460087 and parameters: {'lower_bound': 0.2752782516787483}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:20,689] Trial 35 finished with value: 0.9348089017900338 and parameters: {'lower_bound': 0.3165975150388991}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:20,798] Trial 36 finished with value: 0.934204160619255 and parameters: {'lower_bound': 0.3789756494156335}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:20,908] Trial 37 finished with value: 0.9311804547653604 and parameters: {'lower_bound': 0.23832183353209715}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:21,015] Trial 38 finished with value: 0.934204160619255 and parameters: {'lower_bound': 0.28144136368464656}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:21,123] Trial 39 finished with value: 0.9270682148040639 and parameters: {'lower_bound': 0.2109627917728289}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:21,231] Trial 40 finished with value: 0.9294871794871795 and parameters: {'lower_bound': 0.4742679959110949}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:21,339] Trial 41 finished with value: 0.9349298500241896 and parameters: {'lower_bound': 0.32775895376217157}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:21,446] Trial 42 finished with value: 0.9351717464925012 and parameters: {'lower_bound': 0.3453874968628749}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:21,557] Trial 43 finished with value: 0.9337203676826318 and parameters: {'lower_bound': 0.30000888983012197}. 
Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:21,672] Trial 44 finished with value: 0.9342041606192549 and parameters: {'lower_bound': 0.36027692343685525}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:21,781] Trial 45 finished with value: 0.9327527818093856 and parameters: {'lower_bound': 0.4440129937523572}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:21,893] Trial 46 finished with value: 0.9339622641509434 and parameters: {'lower_bound': 0.382838859207002}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:22,004] Trial 47 finished with value: 0.9349298500241896 and parameters: {'lower_bound': 0.3503846402656715}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:22,113] Trial 48 finished with value: 0.9337203676826318 and parameters: {'lower_bound': 0.41390782702710227}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:22,222] Trial 49 finished with value: 0.9338413159167875 and parameters: {'lower_bound': 0.30498476104276984}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:22,330] Trial 50 finished with value: 0.9322689888727624 and parameters: {'lower_bound': 0.2664666177793133}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:22,438] Trial 51 finished with value: 0.9351717464925011 and parameters: {'lower_bound': 0.33259045584135233}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:22,546] Trial 52 finished with value: 0.9349298500241896 and parameters: {'lower_bound': 0.3271300345070826}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:22,658] Trial 53 finished with value: 0.9344460570875666 and parameters: {'lower_bound': 0.39016692976899503}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:22,768] Trial 54 finished with value: 0.9343251088534108 and parameters: {'lower_bound': 0.3563157481130373}. 
Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:22,876] Trial 55 finished with value: 0.9339622641509434 and parameters: {'lower_bound': 0.29024569658884164}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:22,984] Trial 56 finished with value: 0.9343251088534107 and parameters: {'lower_bound': 0.3130426648858653}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:23,096] Trial 57 finished with value: 0.9346879535558781 and parameters: {'lower_bound': 0.3677817565680831}. Best is trial 17 with value: 0.9351717464925012. [I 2023-05-31 15:11:23,206] Trial 58 finished with value: 0.9355345911949685 and parameters: {'lower_bound': 0.33592059195403656}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:23,314] Trial 59 finished with value: 0.933115626511853 and parameters: {'lower_bound': 0.41910997147619095}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:23,425] Trial 60 finished with value: 0.935292694726657 and parameters: {'lower_bound': 0.3449145016831073}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:23,535] Trial 61 finished with value: 0.9351717464925012 and parameters: {'lower_bound': 0.346878239917013}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:23,649] Trial 62 finished with value: 0.9339622641509433 and parameters: {'lower_bound': 0.3735762637628573}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:23,759] Trial 63 finished with value: 0.9351717464925012 and parameters: {'lower_bound': 0.3481317842779544}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:23,868] Trial 64 finished with value: 0.9351717464925012 and parameters: {'lower_bound': 0.34825496591563093}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:23,978] Trial 65 finished with value: 0.9351717464925012 and parameters: {'lower_bound': 0.3464288674540611}. 
Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:24,085] Trial 66 finished with value: 0.934204160619255 and parameters: {'lower_bound': 0.3940974962789801}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:24,219] Trial 67 finished with value: 0.9350507982583454 and parameters: {'lower_bound': 0.34952563118859487}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:24,405] Trial 68 finished with value: 0.9337203676826318 and parameters: {'lower_bound': 0.2998617921940665}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:24,517] Trial 69 finished with value: 0.9332365747460087 and parameters: {'lower_bound': 0.43079035092084916}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:24,634] Trial 70 finished with value: 0.9337203676826318 and parameters: {'lower_bound': 0.3963604110402907}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:24,745] Trial 71 finished with value: 0.9350507982583454 and parameters: {'lower_bound': 0.35232015314054427}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:24,856] Trial 72 finished with value: 0.9351717464925011 and parameters: {'lower_bound': 0.34192719385582926}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:24,967] Trial 73 finished with value: 0.9346879535558781 and parameters: {'lower_bound': 0.36917833654933624}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:25,078] Trial 74 finished with value: 0.935292694726657 and parameters: {'lower_bound': 0.3187896647501645}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:25,189] Trial 75 finished with value: 0.9350507982583454 and parameters: {'lower_bound': 0.31980846420650283}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:25,300] Trial 76 finished with value: 0.9344460570875666 and parameters: {'lower_bound': 0.31230940559688664}. 
Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:25,412] Trial 77 finished with value: 0.9340832123850992 and parameters: {'lower_bound': 0.411803410262496}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:25,528] Trial 78 finished with value: 0.9340832123850992 and parameters: {'lower_bound': 0.3816530224148868}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:25,731] Trial 79 finished with value: 0.935292694726657 and parameters: {'lower_bound': 0.33049056040263913}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:25,854] Trial 80 finished with value: 0.9351717464925012 and parameters: {'lower_bound': 0.32571565869631813}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:25,967] Trial 81 finished with value: 0.9350507982583454 and parameters: {'lower_bound': 0.3416341817100331}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:26,152] Trial 82 finished with value: 0.9342041606192549 and parameters: {'lower_bound': 0.35788369464155817}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:26,312] Trial 83 finished with value: 0.9338413159167877 and parameters: {'lower_bound': 0.29847088467138927}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:26,437] Trial 84 finished with value: 0.9354136429608128 and parameters: {'lower_bound': 0.336830972673495}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:26,645] Trial 85 finished with value: 0.9338413159167875 and parameters: {'lower_bound': 0.28320623238604026}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:26,764] Trial 86 finished with value: 0.9351717464925012 and parameters: {'lower_bound': 0.33053285295309015}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:26,878] Trial 87 finished with value: 0.9342041606192549 and parameters: {'lower_bound': 0.3087886802890866}. 
Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:26,991] Trial 88 finished with value: 0.9346879535558781 and parameters: {'lower_bound': 0.3685844731037277}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:27,131] Trial 89 finished with value: 0.9351717464925011 and parameters: {'lower_bound': 0.31944463548461377}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:27,240] Trial 90 finished with value: 0.934204160619255 and parameters: {'lower_bound': 0.38450950477588464}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:27,360] Trial 91 finished with value: 0.9354136429608128 and parameters: {'lower_bound': 0.3344369610033063}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:27,551] Trial 92 finished with value: 0.9350507982583454 and parameters: {'lower_bound': 0.3376535611794714}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:27,683] Trial 93 finished with value: 0.9354136429608128 and parameters: {'lower_bound': 0.33424242499436013}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:27,797] Trial 94 finished with value: 0.9338413159167875 and parameters: {'lower_bound': 0.2954288370451707}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:27,912] Trial 95 finished with value: 0.9340832123850992 and parameters: {'lower_bound': 0.3073942436730549}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:28,027] Trial 96 finished with value: 0.9351717464925012 and parameters: {'lower_bound': 0.32243284378284076}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:28,215] Trial 97 finished with value: 0.9346879535558781 and parameters: {'lower_bound': 0.3663086011648416}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:28,391] Trial 98 finished with value: 0.9354136429608128 and parameters: {'lower_bound': 0.33668227550837154}. 
Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:28,507] Trial 99 finished with value: 0.9354136429608128 and parameters: {'lower_bound': 0.3346052842186657}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:28,628] Trial 100 finished with value: 0.9328737300435413 and parameters: {'lower_bound': 0.27425409368683545}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:28,742] Trial 101 finished with value: 0.9355345911949685 and parameters: {'lower_bound': 0.33598116436631115}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:28,855] Trial 102 finished with value: 0.9351717464925011 and parameters: {'lower_bound': 0.32973809301148876}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:28,968] Trial 103 finished with value: 0.9345670053217223 and parameters: {'lower_bound': 0.31527103894100905}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:29,081] Trial 104 finished with value: 0.9354136429608128 and parameters: {'lower_bound': 0.336871478948051}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:29,192] Trial 105 finished with value: 0.9342041606192549 and parameters: {'lower_bound': 0.35857143344459974}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:29,305] Trial 106 finished with value: 0.9350507982583454 and parameters: {'lower_bound': 0.3321022646425612}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:29,418] Trial 107 finished with value: 0.9338413159167875 and parameters: {'lower_bound': 0.3040346108390207}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:29,531] Trial 108 finished with value: 0.9349298500241896 and parameters: {'lower_bound': 0.33791525918315696}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:29,648] Trial 109 finished with value: 0.9337203676826318 and parameters: {'lower_bound': 0.2896839546842996}. 
Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:29,759] Trial 110 finished with value: 0.9351717464925011 and parameters: {'lower_bound': 0.3192879809270603}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:29,874] Trial 111 finished with value: 0.9340832123850992 and parameters: {'lower_bound': 0.36047803146126983}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:29,985] Trial 112 finished with value: 0.934204160619255 and parameters: {'lower_bound': 0.3791286945301645}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:30,097] Trial 113 finished with value: 0.9355345911949685 and parameters: {'lower_bound': 0.33613848517133726}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:30,208] Trial 114 finished with value: 0.9350507982583454 and parameters: {'lower_bound': 0.3378235826695}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:30,318] Trial 115 finished with value: 0.9342041606192549 and parameters: {'lower_bound': 0.30926166033646385}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:30,431] Trial 116 finished with value: 0.9351717464925011 and parameters: {'lower_bound': 0.3240800004490623}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:30,542] Trial 117 finished with value: 0.9340832123850992 and parameters: {'lower_bound': 0.36101668533881226}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:30,660] Trial 118 finished with value: 0.9337203676826318 and parameters: {'lower_bound': 0.3959154602239855}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:30,773] Trial 119 finished with value: 0.9337203676826318 and parameters: {'lower_bound': 0.2999097688979966}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:30,885] Trial 120 finished with value: 0.9355345911949685 and parameters: {'lower_bound': 0.33576782852564385}. 
Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:30,998] Trial 121 finished with value: 0.9354136429608128 and parameters: {'lower_bound': 0.3346276714567442}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:31,108] Trial 122 finished with value: 0.9350507982583454 and parameters: {'lower_bound': 0.3497149448929717}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:31,217] Trial 123 finished with value: 0.9349298500241896 and parameters: {'lower_bound': 0.3399598841990863}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:31,328] Trial 124 finished with value: 0.934204160619255 and parameters: {'lower_bound': 0.3770660293848741}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:31,436] Trial 125 finished with value: 0.9350507982583454 and parameters: {'lower_bound': 0.31834196027680645}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:31,549] Trial 126 finished with value: 0.9342041606192549 and parameters: {'lower_bound': 0.3584915614728749}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:31,661] Trial 127 finished with value: 0.9339622641509434 and parameters: {'lower_bound': 0.2902400656806151}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:31,770] Trial 128 finished with value: 0.9354136429608128 and parameters: {'lower_bound': 0.3335398172214635}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:31,882] Trial 129 finished with value: 0.9354136429608128 and parameters: {'lower_bound': 0.33681925940900637}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:31,990] Trial 130 finished with value: 0.934204160619255 and parameters: {'lower_bound': 0.31015722990210504}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:32,102] Trial 131 finished with value: 0.9355345911949685 and parameters: {'lower_bound': 0.33493730780521785}. 
Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:32,212] Trial 132 finished with value: 0.9354136429608128 and parameters: {'lower_bound': 0.33369162454333845}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:32,323] Trial 133 finished with value: 0.9351717464925012 and parameters: {'lower_bound': 0.325523804282677}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:32,434] Trial 134 finished with value: 0.9344460570875666 and parameters: {'lower_bound': 0.37095041836672266}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:32,547] Trial 135 finished with value: 0.9350507982583454 and parameters: {'lower_bound': 0.3529155649037351}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:32,661] Trial 136 finished with value: 0.9349298500241896 and parameters: {'lower_bound': 0.3407866557485493}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:32,771] Trial 137 finished with value: 0.934204160619255 and parameters: {'lower_bound': 0.3098686214617039}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:32,882] Trial 138 finished with value: 0.9337203676826318 and parameters: {'lower_bound': 0.40270876346612083}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:32,991] Trial 139 finished with value: 0.935292694726657 and parameters: {'lower_bound': 0.3245137750709616}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:33,100] Trial 140 finished with value: 0.9351717464925012 and parameters: {'lower_bound': 0.3468218364738704}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:33,209] Trial 141 finished with value: 0.9350507982583454 and parameters: {'lower_bound': 0.3320789901369347}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:33,318] Trial 142 finished with value: 0.9354136429608128 and parameters: {'lower_bound': 0.3364530792867844}. 
Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:33,425] Trial 143 finished with value: 0.9350507982583454 and parameters: {'lower_bound': 0.3526825481679896}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:33,534] Trial 144 finished with value: 0.9342041606192549 and parameters: {'lower_bound': 0.3631224076264585}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:33,650] Trial 145 finished with value: 0.9350507982583454 and parameters: {'lower_bound': 0.332039021966629}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:33,759] Trial 146 finished with value: 0.9337203676826318 and parameters: {'lower_bound': 0.29952890532116355}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:33,871] Trial 147 finished with value: 0.9345670053217223 and parameters: {'lower_bound': 0.3154202235236613}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:33,983] Trial 148 finished with value: 0.9340832123850992 and parameters: {'lower_bound': 0.3872359320316075}. Best is trial 58 with value: 0.9355345911949685. [I 2023-05-31 15:11:34,093] Trial 149 finished with value: 0.9346879535558781 and parameters: {'lower_bound': 0.3691196066645616}. Best is trial 58 with value: 0.9355345911949685.
{'lower_bound': 0.33592059195403656}
The metrics with respect to this threshold are:
# Evaluate the model at the recall-optimized decision threshold found by Optuna.
# Hoist the threshold and compute the thresholded predictions once instead of
# re-deriving `(proba[:, 1] > ...).astype(int)` separately for every metric.
threshold = recall_study.best_params["lower_bound"]
y_pred_thr = (proba[:, 1] > threshold).astype(int)

acc = accuracy_score(y_test, y_pred_thr)
print(f"Accuracy score: {acc:.3f}")

prec = precision_score(y_test, y_pred_thr)
print(f"Precision score: {prec:.3f}")

recall = recall_score(y_test, y_pred_thr)
print(f"Recall score: {recall:.3f}")

# Row-normalized confusion matrix (each true class sums to 1).
cm = confusion_matrix(y_test, y_pred_thr, normalize="true")
ConfusionMatrixDisplay(cm, display_labels=model.classes_).plot(colorbar=False)
Accuracy score: 0.936 Precision score: 0.929 Recall score: 0.944
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f1853ee0f40>
Alter class weights: we can try to alter the class weights to give more importance to the infected class. This will lead to a better recall on the infected class but it will also increase the false positive rate on the healthy class.
# Optimize the class weight
best_weight = None


def best_class_weight(trial, X_train, y_train, params):
    """Optuna objective for tuning the class-0 weight of the HGB classifier.

    Parameters
    ----------
    trial : optuna.Trial
        Supplies the ``weight_0`` suggestion on a log scale in [0.1, 0.5].
    X_train, y_train : array-like
        Training features and labels used for cross-validation.
    params : dict
        Fixed HistGradientBoostingClassifier hyperparameters found earlier.

    Returns
    -------
    float
        Mean 5-fold cross-validated ``recall_macro`` score.
    """
    weight_0 = trial.suggest_float("weight_0", 0.1, 0.5, log=True)
    # Complementary weights keep the total class influence constant.
    class_weight = {0: weight_0, 1: 1 - weight_0}
    # define the model
    model = make_pipeline(
        RobustScaler(),
        HistGradientBoostingClassifier(**params, class_weight=class_weight),
    )
    # Fix the CV shuffle seed so the objective is deterministic across trials:
    # an unseeded shuffle adds fold-assignment noise that misleads the sampler.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    score = cross_val_score(
        model, X_train, y_train, cv=skf, scoring="recall_macro", n_jobs=-1
    )
    return score.mean()
# ! this takes a while to run
# define the study
# weight_study = optuna.create_study(direction='maximize', study_name='class_weight', sampler=optuna.samplers.TPESampler(), pruner=optuna.pruners.MedianPruner())
# J = lambda trial: best_class_weight(trial, X_train_val, y_train_val, best_params)
# weight_study.optimize(J, n_trials=10, gc_after_trial=True)
# best_weight = weight_study.best_params
Optimization Results:

# Test the altered model
# Fall back on the weight found in a previous optimization run so the (slow)
# Optuna study does not need to be re-run on every notebook execution.
if best_weight is None:
    best_weight = 0.39781129987053776

class_weights = {0: best_weight, 1: 1 - best_weight}
altered_model = make_pipeline(
    RobustScaler(),
    HistGradientBoostingClassifier(**best_params, class_weight=class_weights),
)

# Score the re-weighted model on the held-out test set.
s, c = get_scores(
    altered_model,
    X_train_val,
    y_train_val,
    X_test,
    y_test,
    display=True,
    name=f"HGB Best Altered [CW_0 = {best_weight:.3f}]",
)
s
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.920283 | 0.94388 | 0.931932 | 4134 | NaN |
| 1.0 | 0.942403 | 0.918239 | 0.930164 | 4134 | NaN |
| accuracy | NaN | NaN | 0.93106 | 8268 | NaN |
| macro avg | 0.931343 | 0.93106 | 0.931048 | 8268 | NaN |
| weighted avg | 0.931343 | 0.93106 | 0.931048 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.975726 |
Looking at the recall score for the infected class we can see a little improvement without losing too much precision on the healthy class. The ROC-AUC is also very good. We could try to optimize the weights with a bigger number of trials, but the score is so high that further improvement is not necessary and it would take a lot of time.
Try to use different views of the dataset: looking at the RGB channels separately, we could find that the infection is more visible in one of the channels and use that channel to train the model.
To do this we will use a model that is able to make a good score but with lesser training time: the Stochastic Gradient Descent classifier.
# clean up the memory again
# Drop everything left over from the grayscale modelling phase before
# building the per-channel datasets; grouped by what the names were.
# -- metrics and intermediate results
del acc, average, base_recall, cm, c, prec, pred, proba, recall, s
# -- models, studies and pipelines
del best_params, best_weight, hgb, recall_study, robust_scale, std_scale
# -- datasets, splits and indices
del df, indexes, X, X_train_val, y_train_val
# -- plotting handles, thresholds and misc configuration
del ax, bound, lb, new_prob, new_prob_list, randint, root, test_ratio, ub
gc.collect()
8760
# Make dataset for each color channel
# make_dataset (defined earlier) is CPU-bound, so a process pool is used to
# convert the per-channel image lists in parallel.
with concurrent.futures.ProcessPoolExecutor() as executor:
    r_u = np.asarray(list(executor.map(make_dataset, r_u_list)))  # red, uninfected
    r_p = np.asarray(list(executor.map(make_dataset, r_p_list)))  # red, parasitized
    g_u = np.asarray(list(executor.map(make_dataset, g_u_list)))  # green, uninfected
    g_p = np.asarray(list(executor.map(make_dataset, g_p_list)))  # green, parasitized
    b_u = np.asarray(list(executor.map(make_dataset, b_u_list)))  # blue, uninfected
    b_p = np.asarray(list(executor.map(make_dataset, b_p_list)))  # blue, parasitized
# Stack uninfected rows first, parasitized second for each channel --
# presumably the same ordering used when y was built; confirm against that cell.
X_R = np.concatenate((r_u, r_p))
X_G = np.concatenate((g_u, g_p))
X_B = np.concatenate((b_u, b_p))
# NOTE(review): b_u_list and b_p_list are NOT freed here, unlike the red and
# green lists -- presumably kept for the later filter experiments; confirm,
# otherwise they leak the blue-channel image lists.
del (r_u, r_p, g_u, g_p, b_u, b_p, r_u_list, r_p_list, g_u_list, g_p_list, executor)
gc.collect()
0
# make datasets and plot some samples
test_ratio = 0.3


def _split_channel(X_channel):
    """Return a stratified train/test split for one flattened channel dataset.

    The features are joined with the labels in a single DataFrame so feature
    rows and labels stay aligned by index before splitting. Returns the usual
    (X_train, X_test, y_train, y_test) tuple from train_test_split.
    """
    df = pd.DataFrame(data=X_channel, columns=range(np.shape(X_channel)[1])).merge(
        pd.DataFrame(data=y, columns=["labels"]), left_index=True, right_index=True
    )
    return train_test_split(
        df.drop("labels", axis=1),
        df["labels"],
        test_size=test_ratio,
        shuffle=True,
        stratify=y,
        random_state=42,
    )


# BUG FIX: in the original code the green and blue splits were swapped
# (the DataFrame built from X_G fed the *_B_* variables and vice versa),
# so every later "blue channel" result was actually computed on the green
# channel. Each channel now feeds its own split.
X_R_train, X_R_test, y_R_train, y_R_test = _split_channel(X_R)
X_G_train, X_G_test, y_G_train, y_G_test = _split_channel(X_G)
X_B_train, X_B_test, y_B_train, y_B_test = _split_channel(X_B)

# Show the first ten training samples of each channel, one row per channel.
fig, ax = plt.subplots(3, 10, figsize=(20, 6))
fig.suptitle("Uninfected and Parasitized Cells with RGB channels", fontsize=16)
channel_rows = (
    (X_R_train, y_R_train, "Reds"),
    (X_G_train, y_G_train, "Greens"),
    (X_B_train, y_B_train, "Blues"),
)
for row, (X_tr, y_tr, cmap) in enumerate(channel_rows):
    for col in range(10):
        ax[row, col].imshow(X_tr.iloc[col].values.reshape(48, 48), cmap=cmap)
        ax[row, col].set_title(f"class: {y_tr.iloc[col]}")
        ax[row, col].axis("off")
plt.show()
Make tests without any filters
# define the pipeline again
# Robust scaling -> Nystroem kernel approximation -> linear SGD classifier;
# modified_huber loss gives calibrated-ish probabilities via predict_proba.
scaler = RobustScaler()
kernel_map = Nystroem(n_components=1000, n_jobs=-1)
sgd_clf = SGDClassifier(
    loss="modified_huber", penalty="elasticnet", max_iter=10000, n_jobs=-1
)
pipe = make_pipeline(scaler, kernel_map, sgd_clf)
# Tests for Red channel
s_r, c_r = get_scores(
    pipe,
    X_R_train,
    y_R_train,
    X_R_test,
    y_R_test,
    display=True,
    name="Red Channel",
)
s_r
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.73584 | 0.776246 | 0.755503 | 4134 | NaN |
| 1.0 | 0.763245 | 0.721335 | 0.741699 | 4134 | NaN |
| accuracy | NaN | NaN | 0.748791 | 8268 | NaN |
| macro avg | 0.749543 | 0.748791 | 0.748601 | 8268 | NaN |
| weighted avg | 0.749543 | 0.748791 | 0.748601 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.820783 |
# Tests for Green channel
s_g, c_g = get_scores(
    pipe,
    X_G_train,
    y_G_train,
    X_G_test,
    y_G_test,
    display=True,
    name="Green Channel",
)
s_g
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.811821 | 0.740929 | 0.774757 | 4134 | NaN |
| 1.0 | 0.761735 | 0.828254 | 0.793603 | 4134 | NaN |
| accuracy | NaN | NaN | 0.784591 | 8268 | NaN |
| macro avg | 0.786778 | 0.784591 | 0.78418 | 8268 | NaN |
| weighted avg | 0.786778 | 0.784591 | 0.78418 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.859053 |
# Tests for Blue channel
s_b, c_b = get_scores(
    pipe,
    X_B_train,
    y_B_train,
    X_B_test,
    y_B_test,
    display=True,
    name="Blue Channel",
)
s_b
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.848505 | 0.926705 | 0.885883 | 4134 | NaN |
| 1.0 | 0.919265 | 0.834543 | 0.874857 | 4134 | NaN |
| accuracy | NaN | NaN | 0.880624 | 8268 | NaN |
| macro avg | 0.883885 | 0.880624 | 0.88037 | 8268 | NaN |
| weighted avg | 0.883885 | 0.880624 | 0.88037 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.94561 |
It seems that the Stochastic Gradient Descent model is able to make good predictions with the Blue dataset, and they are better than the ones made with the grayscale dataset (comparing the same model). In the next step we will try to use some filters to see if we can improve the scores.
# Define support functions for the filters
from skimage import exposure
from skimage.filters import gaussian, meijering, sato, hessian
from skimage.restoration import denoise_tv_chambolle
def contrast_strength(img):
    """Stretch the image intensities to the full range; returns a flat array."""
    height, width = img.shape
    stretched = exposure.rescale_intensity(img.reshape(height, width))
    return stretched.reshape(height * width)
def denoise_tv(img):
    """Apply total-variation denoising (weight=0.1); returns a flat array."""
    height, width = img.shape
    denoised = denoise_tv_chambolle(img.reshape(height, width), weight=0.1)
    return denoised.reshape(height * width)
def gaussian_blur(img):
    """Blur with a Gaussian (sigma=1, kernel truncated at 2 sigma); flat output."""
    height, width = img.shape
    blurred = gaussian(img.reshape(height, width), sigma=1, truncate=2)
    return blurred.reshape(height * width)
def meijering_filter(img):
    """Apply the Meijering ridge filter at scales 1..4; returns a flat array."""
    height, width = img.shape
    ridges = meijering(img.reshape(height, width), range(1, 5))
    return ridges.reshape(height * width)
def sato_filter(img):
    """Apply the Sato tubeness filter at scales 1..4; returns a flat array."""
    height, width = img.shape
    tubes = sato(img.reshape(height, width), range(1, 5))
    return tubes.reshape(height * width)
def equalize_adaptive(img):
    """Adaptive histogram equalization (CLAHE) on a 5x5 tile grid; flat output."""
    height, width = img.shape
    # One tile per fifth of each image dimension; clip_limit 0.9 is aggressive.
    tiles = np.array((height // 5, width // 5))
    equalized = exposure.equalize_adapthist(
        img.reshape(height, width), kernel_size=tiles, clip_limit=0.9
    )
    return equalized.reshape(height * width)
# Map filter names to their implementations; a dispatch table is clearer and
# easier to extend than the original if/elif chain.
_FILTERS = {
    "contrast_strength": contrast_strength,
    "denoise_tv": denoise_tv,
    "gaussian": gaussian_blur,
    "meijering": meijering_filter,
    "sato": sato_filter,
    "adaptive": equalize_adaptive,
}


def make_filtered_dataset(args):
    """Same as make_dataset but with a filter applied.

    Parameters
    ----------
    args : dict with keys
        "img"    : input image array
        "size"   : target side length for the square resize
        "filter" : name of the filter to apply (a key of _FILTERS); an unknown
                   name leaves the image unfiltered, preserving the original
                   fall-through behaviour.

    Returns
    -------
    np.ndarray
        The (optionally filtered) image flattened to 1-D.
    """
    # Named 'filt' rather than 'filter' to avoid shadowing the builtin.
    img, size, filt = args["img"], args["size"], args["filter"]
    img = resize(img, (size, size), anti_aliasing=True)
    img = remove_background(img)
    img = _FILTERS.get(filt, lambda x: x)(img)
    return np.asarray(img).flatten()
# Show what the filters do to an image
img = X_B_train.iloc[0].values.reshape(48, 48)
fig, ax = plt.subplots(2, 4, figsize=(10, 5))
# (row, col, image data, title, colormap) for each panel; the last column
# shows the unfiltered image in the blue and grey colormaps for reference.
panels = [
    (0, 0, contrast_strength(img), "contrast_strength", "Greys"),
    (0, 1, gaussian_blur(img), "gaussian_blur", "Greys"),
    (1, 0, denoise_tv(img), "denoise_tv", "Greys"),
    (1, 1, meijering_filter(img), "meijering_filter", "Greys"),
    (0, 2, sato_filter(img), "sato_filter", "Greys"),
    (1, 2, equalize_adaptive(img), "Adaptive Contrast Eq", "Greys"),
    (0, 3, img, "original Blue", "Blues"),
    (1, 3, img, "original Grey", "Greys"),
]
for row, col, data, title, cmap in panels:
    ax[row, col].imshow(data.reshape(48, 48), cmap=cmap)
    ax[row, col].set_title(title)
    ax[row, col].axis("off")
(-0.5, 47.5, 47.5, -0.5)
# f = "contrast_strength"
# with concurrent.futures.ProcessPoolExecutor() as executor:
# arg = [{'img': img, 'size': size, 'filter': f} for img in b_list]
# filtered_set = np.asarray(list(executor.map(make_filtered_dataset, arg)))
# df_dict[f] = pd.DataFrame(filtered_set)
# X_b_train, X_b_test, y_b_train, y_b_test = train_test_split(df_dict['contrast_strength'], np.asarray(y).ravel(), test_size=0.3, random_state=42, stratify=y)
# s_adjust, _ = get_scores(pipe, X_b_train, y_b_train, X_b_test, y_b_test, display=True, name="Contrast Strength")
# s_adjust
# f = "denoise_tv"
# with concurrent.futures.ProcessPoolExecutor() as executor:
# arg = [{'img': img, 'size': size, 'filter': f} for img in b_list]
# filtered_set = np.asarray(list(executor.map(make_filtered_dataset, arg)))
# df_dict[f] = pd.DataFrame(filtered_set)
# X_b_train, X_b_test, y_b_train, y_b_test = train_test_split(df_dict['denoise_tv'], np.asarray(y).ravel(), test_size=0.3, random_state=42, stratify=y)
# s_adjust, _ = get_scores(pipe, X_b_train, y_b_train, X_b_test, y_b_test, display=True, name="Denoise TV")
# s_adjust
# f = "gaussian"
# with concurrent.futures.ProcessPoolExecutor() as executor:
# arg = [{'img': img, 'size': size, 'filter': f} for img in b_list]
# filtered_set = np.asarray(list(executor.map(make_filtered_dataset, arg)))
# df_dict[f] = pd.DataFrame(filtered_set)
# X_b_train, X_b_test, y_b_train, y_b_test = train_test_split(df_dict['gaussian'], np.asarray(y).ravel(), test_size=0.3, random_state=42, stratify=y)
# s_adjust, _ = get_scores(pipe, X_b_train, y_b_train, X_b_test, y_b_test, display=True, name="gaussian blur")
# s_adjust
# f = "meijering"
# with concurrent.futures.ProcessPoolExecutor() as executor:
# arg = [{'img': img, 'size': size, 'filter': f} for img in b_list]
# filtered_set = np.asarray(list(executor.map(make_filtered_dataset, arg)))
# df_dict[f] = pd.DataFrame(filtered_set)
# X_b_train, X_b_test, y_b_train, y_b_test = train_test_split(df_dict['meijering'], np.asarray(y).ravel(), test_size=0.3, random_state=42, stratify=y)
# s_adjust, _ = get_scores(pipe, X_b_train, y_b_train, X_b_test, y_b_test, display=True, name="Meijering")
# s_adjust
# f = "sato"
# with concurrent.futures.ProcessPoolExecutor() as executor:
# arg = [{'img': img, 'size': size, 'filter': f} for img in b_list]
# filtered_set = np.asarray(list(executor.map(make_filtered_dataset, arg)))
# df_dict[f] = pd.DataFrame(filtered_set)
# X_b_train, X_b_test, y_b_train, y_b_test = train_test_split(df_dict['sato'], np.asarray(y).ravel(), test_size=0.3, random_state=42, stratify=y)
# s_adjust, _ = get_scores(pipe, X_b_train, y_b_train, X_b_test, y_b_test, display=True, name="Sato")
# s_adjust
# f = "adaptive"
# with concurrent.futures.ProcessPoolExecutor() as executor:
# arg = [{'img': img, 'size': size, 'filter': f} for img in b_list]
# filtered_set = np.asarray(list(executor.map(make_filtered_dataset, arg)))
# df_dict[f] = pd.DataFrame(filtered_set)
# X_b_train, X_b_test, y_b_train, y_b_test = train_test_split(df_dict['adaptive'], np.asarray(y).ravel(), test_size=0.3, random_state=42, stratify=y)
# s_adjust, _ = get_scores(pipe, X_b_train, y_b_train, X_b_test, y_b_test, display=True, name="Adaptive Contrast Eq")
# s_adjust
For the sake of time we will just present the results of the tests with the filters:
We saw that filters don't improve the score, but as a last test we can see what happens if we use the blue channel dataset to train the HGB model we found in the previous phase. To get comparable results we will use the pipeline defined before the optimization phase.
# Test the HGB model on the blue dataset
# Reuse the pre-optimization robust-scaled HGB pipeline for a fair comparison
# with the grayscale results.
score_blue, _ = get_scores(
    robust_pipe_HGB,
    X_B_train, y_B_train,
    X_B_test, y_B_test,
    display=True,
    name="HGB Blue",
)
score_blue
| precision | recall | f1-score | support | AUC | |
|---|---|---|---|---|---|
| 0.0 | 0.8945 | 0.959845 | 0.926021 | 4134 | NaN |
| 1.0 | 0.956681 | 0.886792 | 0.920412 | 4134 | NaN |
| accuracy | NaN | NaN | 0.923319 | 8268 | NaN |
| macro avg | 0.92559 | 0.923319 | 0.923216 | 8268 | NaN |
| weighted avg | 0.92559 | 0.923319 | 0.923216 | 8268 | NaN |
| ROC | NaN | NaN | NaN | NaN | 0.975396 |
The overall score seems to be the same but the recall is higher, so we can say that the blue channel dataset is better than the grayscale one. Probably the infection is more visible in the blue channel and this is the reason why the model is able to make better predictions. We could go further and try to use the blue channel to achieve better results with optimizations and altering the class weights but we will stop here.